/*=======================================================================
Creator: Jingyuan Wang, jingyuanwang@u.northwestern.edu
Date created: 				08/12/2018
Date last modified: 		
Purpose: 

1. Import all the OCEW raw data
2. Clean 
	2.1 keep only state level NAICS31-33 observations
	2.2 make sure we have all the NAICS6 industries we need, if not, keep NAICS5 industries

 
==========================================================================*/


*************************************************************************
* 				PART A. Import data	and append				 			*
*************************************************************************
clear all

* 1. import
forvalues year = 1990(1)2017 {
	tempfile QCEW`year'
	import delimited "$QCEWraw/`year'.annual.singlefile.csv", varn(1) clear 
	
	* some string variables might be all missing and thus be byte in some year
	tostring disclosure_code, replace force
	replace disclosure_code = "" if disclosure_code == "."
	tostring lq_disclosure_code, replace force
	replace lq_disclosure_code = "" if lq_disclosure_code == "."
	tostring oty_disclosure_code, replace force
	replace oty_disclosure_code = "" if oty_disclosure_code == "."
	
	save `QCEW`year'.dta', replace
	disp("`year' DONE")
}
*

* 2. append
use `QCEW1990.dta', clear
forvalues year = 1991(1)2017 {
	append using `QCEW`year'.dta'
}
*

* 3. label all variables
label var area_fips "5-character FIPS code"
label var own_code "1-character ownership code"
			lab def own_code 0	"Total Covered"  5	"Private" 4	"International Government" 3 "Local Government" 2 "State Government" ///
							 1	"Federal Government" 8	"Total Government" 9 "Total U.I. Covered (Excludes Federal Government)"
			lab value own_code own_code
label var industry_code "6-character industry code (NAICS, SuperSector)"
label var agglvl_code "2-character aggregation level code"
label var size_code "1-character size code"
label var year "4-character year"
label var qtr "1-character quarter (always A for annual)"
label var disclosure_code  "1-character disclosure code (either ' '(blank) or 'N' not disclosed)"
label var annual_avg_estabs  "Annual average of quarterly establishment counts for a given year"
label var annual_avg_emplvl "Annual average of monthly employment levels for a given year"
label var total_annual_wages "Sum of the four quarterly total wage levels for a given year"
label var taxable_annual_wages "Sum of the four quarterly total taxable wage totals for a given year"
label var annual_contributions "Sum of the four quarterly contribution totals for a given year"
label var annual_avg_wkly_wage "Average weekly wage based on the 12-monthly employment levels and total annual wage levels"
label var avg_annual_pay "Average annual pay based on employment and wage levels for a given year"
label var lq_disclosure_code "1-character location-quotient disclosure code (either '' (blank) or 'N' not disclosed)"

keep area_fips own_code industry_code agglvl_code size_code ///
	year disclosure_code ///
	annual_avg_estabs annual_avg_emplvl total_annual_wages taxable_annual_wages annual_contributions  

* 4. save
compress
save "$QCEW/raw/QCEW_all.dta", replace

*************************************************************************
* 				PART B. clean dataset 				 					*
*************************************************************************

* 1. keep only total covered
use "$QCEW/raw/QCEW_all.dta", clear

* 2. keep only NAICS31-33 industries
* keep only manufacturing
gen sector = substr(industry_code,1,2)
keep if sector == "31" | sector == "32" | sector == "33"


* 3. keep only state level observations
replace area_fips = "00000" if area_fips == "US000"
destring area_fips, replace force
drop if area_fips == .
* keep only state-level or above
keep if floor(area_fips/1000) * 1000 == area_fips
gen state_code = area_fips/1000
drop area_fips

* 4. clean industry code
* generate numeric code
destring industry_code, gen(NAICS_code) force

* separate aggregated sectors
replace NAICS_code = 99 if industry_code == "31-33"
gen industry_level = strlen(industry_code)
replace industry_level = 2 if industry_code == "31-33"
lab def level 2 "Total" 3 "3-digit sector" 4 "4-digit industry" 5 "5-digit industry" 6 "6-digit industry"
lab value industry_level level

order state_code industry_code NAICS_code sector industry_level agglvl_code size_code year


* 5 collapse observations of different ownership within a industry 
drop size_code agglvl_code
keep if disclosure_code != "N"
drop disclosure_code

bysort state_code industry_code NAICS_code sector industry_level year (total_annual_wages): gen allmissing = missing(total_annual_wages[1])

collapse (sum) annual_avg_estabs annual_avg_emplvl total_annual_wages taxable_annual_wages annual_contributions ///
		(mean) allmissing, by(state_code industry_code NAICS_code sector industry_level year) 

replace total_annual_wages = .   if allmissing==1
drop allmissing
		
label var annual_avg_estabs  "Annual average of quarterly establishment counts for a given year"
label var annual_avg_emplvl "Annual average of monthly employment levels for a given year"
label var total_annual_wages "Sum of the four quarterly total wage levels for a given year"
label var taxable_annual_wages "Sum of the four quarterly total taxable wage totals for a given year"
label var annual_contributions "Sum of the four quarterly contribution totals for a given year"

save "$QCEW/QCEW_state_ind_year.dta", replace

* 6. collapse US level
use "$QCEW/QCEW_state_ind_year.dta", clear

* keep at US level
keep if state_code==0

replace total_annual_wages = total_annual_wages/1000000

label var total_annual_wages "Sum of the four quarterly total wage levels for a given year, in $M"

drop annual_contributions taxable_annual_wages state_code industry_code industry_level sector

rename NAICS_code naics 

save "$QCEW/QCEW_ind_year.dta", replace
